{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 1: 导入使用的库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ChineseNlpCorpus']\n",
      "['hfl']\n",
      "['super_glue', 'smape', 'mase', 'charcut_mt', 'wiki_split', 'mean_iou', 'ter', 'bertscore', 'glue', 'bleurt', 'frugalscore', 'matthews_correlation', 'cer', 'rl_reliability', 'xtreme_s', 'perplexity', 'sari', 'character', 'mahalanobis', 'seqeval', 'indic_glue', 'meteor', 'google_bleu', 'squad_v2', 'competition_math', 'squad', 'nist_mt', 'r_squared', 'confusion_matrix', 'roc_auc', 'f1', 'recall', 'mape', 'chrf', 'pearsonr', 'spearmanr', 'sacrebleu', 'wer', 'xnli', 'mse', 'exact_match', 'mae', 'coval', 'code_eval', 'mauve', 'trec_eval', 'cuad', 'poseval', 'brier_score', 'precision', 'comet', 'accuracy', 'bleu', 'rouge']\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments\n",
    "import os\n",
    "from torch.utils.data import DataLoader, Dataset, random_split\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "data_dir = os.path.expanduser(\"~/datasets/\")\n",
    "model_dir = os.path.expanduser(\"~/models/\")\n",
    "evaluate_dir = os.path.expanduser(\"~/.cache/huggingface/evaluate/metrics\")\n",
    "print(os.listdir(data_dir))\n",
    "print(os.listdir(model_dir))\n",
    "print(os.listdir(evaluate_dir))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 2: 加载和处理数据并创建dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7765\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_name = r'ChineseNlpCorpus/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv'\n",
    "data_path = os.path.join(data_dir, data_name)\n",
    "dataset = load_dataset(\"csv\", data_files=data_path, split='train')\n",
    "dataset = dataset.filter(lambda example: example[\"review\"] is not None and example['label'] is not None)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'label': 1, 'review': '距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较为简单.'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(iter(dataset))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 划分数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 6988\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 777\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datasets = dataset.train_test_split(test_size=0.1)\n",
    "datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a8974a9be09a408eb840353152548dec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bd260b9ae8e34850b53aca1d9444ac44",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 6988\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 777\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_name = r\"hfl/rbt3\"\n",
    "model_path = os.path.join(model_dir, model_name)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "def process_function(examples):\n",
    "    tokenized_example = tokenizer(examples[\"review\"], max_length=128, truncation=True)\n",
    "    tokenized_example[\"labels\"] = examples['label']\n",
    "    return tokenized_example\n",
    "\n",
    "batch_size = 32\n",
    "tokenized_datasets = datasets.map(process_function, batched=True, batch_size=batch_size, remove_columns=datasets['train'].column_names)\n",
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step4: 创建evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "acc_metrics = evaluate.load(os.path.join(evaluate_dir, \"accuracy\"))\n",
    "f1_metrics = evaluate.load(os.path.join(evaluate_dir, \"f1\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def eval_metrics(eval_predict):\n",
    "    predictions, labels = eval_predict\n",
    "    predictions = predictions.argmax(axis=-1)\n",
    "    acc = acc_metrics.compute(predictions=predictions, references=labels)\n",
    "    f1 = f1_metrics.compute(predictions=predictions, references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step5: 创建trainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/sxliu/models/hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained(model_path, id2label={0: \"差评!\", 1: \"好评!\"}, num_labels = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainingArguments(\n",
       "_n_gpu=2,\n",
       "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
       "adafactor=False,\n",
       "adam_beta1=0.9,\n",
       "adam_beta2=0.999,\n",
       "adam_epsilon=1e-08,\n",
       "auto_find_batch_size=False,\n",
       "batch_eval_metrics=False,\n",
       "bf16=False,\n",
       "bf16_full_eval=False,\n",
       "data_seed=None,\n",
       "dataloader_drop_last=False,\n",
       "dataloader_num_workers=0,\n",
       "dataloader_persistent_workers=False,\n",
       "dataloader_pin_memory=True,\n",
       "dataloader_prefetch_factor=None,\n",
       "ddp_backend=None,\n",
       "ddp_broadcast_buffers=None,\n",
       "ddp_bucket_cap_mb=None,\n",
       "ddp_find_unused_parameters=None,\n",
       "ddp_timeout=1800,\n",
       "debug=[],\n",
       "deepspeed=None,\n",
       "disable_tqdm=False,\n",
       "dispatch_batches=None,\n",
       "do_eval=True,\n",
       "do_predict=False,\n",
       "do_train=False,\n",
       "eval_accumulation_steps=None,\n",
       "eval_delay=0,\n",
       "eval_do_concat_batches=True,\n",
       "eval_on_start=False,\n",
       "eval_steps=None,\n",
       "eval_strategy=epoch,\n",
       "eval_use_gather_object=False,\n",
       "evaluation_strategy=None,\n",
       "fp16=False,\n",
       "fp16_backend=auto,\n",
       "fp16_full_eval=False,\n",
       "fp16_opt_level=O1,\n",
       "fsdp=[],\n",
       "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
       "fsdp_min_num_params=0,\n",
       "fsdp_transformer_layer_cls_to_wrap=None,\n",
       "full_determinism=False,\n",
       "gradient_accumulation_steps=1,\n",
       "gradient_checkpointing=False,\n",
       "gradient_checkpointing_kwargs=None,\n",
       "greater_is_better=True,\n",
       "group_by_length=False,\n",
       "half_precision_backend=auto,\n",
       "hub_always_push=False,\n",
       "hub_model_id=None,\n",
       "hub_private_repo=False,\n",
       "hub_strategy=every_save,\n",
       "hub_token=<HUB_TOKEN>,\n",
       "ignore_data_skip=False,\n",
       "include_for_metrics=[],\n",
       "include_inputs_for_metrics=False,\n",
       "include_num_input_tokens_seen=False,\n",
       "include_tokens_per_second=False,\n",
       "jit_mode_eval=False,\n",
       "label_names=None,\n",
       "label_smoothing_factor=0.0,\n",
       "learning_rate=5e-05,\n",
       "length_column_name=length,\n",
       "load_best_model_at_end=True,\n",
       "local_rank=0,\n",
       "log_level=passive,\n",
       "log_level_replica=warning,\n",
       "log_on_each_node=True,\n",
       "logging_dir=./checkpoints/runs/Nov03_15-50-37_cu01,\n",
       "logging_first_step=False,\n",
       "logging_nan_inf_filter=True,\n",
       "logging_steps=12,\n",
       "logging_strategy=steps,\n",
       "lr_scheduler_kwargs={},\n",
       "lr_scheduler_type=linear,\n",
       "max_grad_norm=1.0,\n",
       "max_steps=-1,\n",
       "metric_for_best_model=f1,\n",
       "mp_parameters=,\n",
       "neftune_noise_alpha=None,\n",
       "no_cuda=False,\n",
       "num_train_epochs=3.0,\n",
       "optim=adamw_torch,\n",
       "optim_args=None,\n",
       "optim_target_modules=None,\n",
       "output_dir=./checkpoints,\n",
       "overwrite_output_dir=False,\n",
       "past_index=-1,\n",
       "per_device_eval_batch_size=64,\n",
       "per_device_train_batch_size=32,\n",
       "prediction_loss_only=False,\n",
       "push_to_hub=False,\n",
       "push_to_hub_model_id=None,\n",
       "push_to_hub_organization=None,\n",
       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
       "ray_scope=last,\n",
       "remove_unused_columns=True,\n",
       "report_to=['tensorboard'],\n",
       "restore_callback_states_from_checkpoint=False,\n",
       "resume_from_checkpoint=None,\n",
       "run_name=./checkpoints,\n",
       "save_on_each_node=False,\n",
       "save_only_model=False,\n",
       "save_safetensors=True,\n",
       "save_steps=500,\n",
       "save_strategy=epoch,\n",
       "save_total_limit=3,\n",
       "seed=42,\n",
       "skip_memory_metrics=True,\n",
       "split_batches=None,\n",
       "tf32=None,\n",
       "torch_compile=False,\n",
       "torch_compile_backend=None,\n",
       "torch_compile_mode=None,\n",
       "torch_empty_cache_steps=None,\n",
       "torchdynamo=None,\n",
       "tpu_metrics_debug=False,\n",
       "tpu_num_cores=None,\n",
       "use_cpu=False,\n",
       "use_ipex=False,\n",
       "use_legacy_prediction_loop=False,\n",
       "use_liger_kernel=False,\n",
       "use_mps_device=False,\n",
       "warmup_ratio=0.0,\n",
       "warmup_steps=0,\n",
       "weight_decay=0.0,\n",
       ")"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_args = TrainingArguments(output_dir=\"./checkpoints\",\n",
    "                               per_device_train_batch_size=32,\n",
    "                               per_device_eval_batch_size=64,\n",
    "                               logging_steps=12,\n",
    "                               eval_strategy='epoch',\n",
    "                               save_strategy=\"epoch\",\n",
    "                               save_total_limit=3,\n",
    "                               metric_for_best_model=\"f1\",\n",
    "                               load_best_model_at_end=True)\n",
    "train_args"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    }
   ],
   "source": [
    "trainer = Trainer(model=model,\n",
    "                  args=train_args,\n",
    "                  train_dataset=tokenized_datasets[\"train\"],\n",
    "                  eval_dataset=tokenized_datasets[\"test\"],\n",
    "                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                  compute_metrics=eval_metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
      "  warnings.warn('Was asked to gather along dimension 0, but all '\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='330' max='330' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [330/330 00:50, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>F1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.034200</td>\n",
       "      <td>0.847543</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.894587</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.013000</td>\n",
       "      <td>0.938506</td>\n",
       "      <td>0.863578</td>\n",
       "      <td>0.899811</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>0.902526</td>\n",
       "      <td>0.868726</td>\n",
       "      <td>0.905028</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
      "  warnings.warn('Was asked to gather along dimension 0, but all '\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=330, training_loss=0.01481244115328247, metrics={'train_runtime': 50.6603, 'train_samples_per_second': 413.815, 'train_steps_per_second': 6.514, 'total_flos': 351909933963264.0, 'train_loss': 0.01481244115328247, 'epoch': 3.0})"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/sxliu/models/hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained(model_path, id2label={0: \"差评!\", 1: \"好评!\"}, num_labels = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "epochs = 2\n",
    "\n",
    "def evaluate():\n",
    "    model.eval()\n",
    "    acc_num = 0\n",
    "    with torch.inference_mode():\n",
    "        for batch in val_dataloader:\n",
    "            if torch.cuda.is_available():\n",
    "                batch = {k: v.cuda() for k, v in batch.items()}\n",
    "            outputs = model(**batch)\n",
    "            pred = torch.argmax(outputs.logits, dim=-1)\n",
    "            acc_num += (pred.long() == batch[\"labels\"].long()).float().sum()\n",
    "    return acc_num / len(val_dataset)\n",
    "\n",
    "def train(epochs, log_step=100):\n",
    "    global_step = 0\n",
    "    for epoch in range(epochs):\n",
    "        model.train()\n",
    "        for batch in train_dataloader:\n",
    "            batch = {k: v.cuda() for k, v in batch.items()}\n",
    "            optimazer.zero_grad()\n",
    "            outputs = model(**batch)\n",
    "            outputs.loss.backward()\n",
    "            optimazer.step()\n",
    "\n",
    "            if global_step % log_step == 0:\n",
    "                print(f\"##epoch {epoch} loss is {outputs.loss.item():.4f}\")\n",
    "\n",
    "            global_step += 1\n",
    "        acc = evaluate()\n",
    "        print(f\"##epoch {epoch} val acc is {acc:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##epoch 0 loss is 0.6460\n",
      "##epoch 0 loss is 0.1787\n",
      "##epoch 0 loss is 0.3322\n",
      "##epoch 0 val acc is 0.8983\n",
      "##epoch 1 loss is 0.2462\n",
      "##epoch 1 loss is 0.2407\n",
      "##epoch 1 val acc is 0.9022\n"
     ]
    }
   ],
   "source": [
    "train(epochs, log_step=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "输入: 包装压坏了，里面的东西也坏了，怎么办\n",
      "模型输出结果: 差评!\n"
     ]
    }
   ],
   "source": [
    "sentence = \"包装压坏了，里面的东西也坏了，怎么办\"\n",
    "model.eval()\n",
    "with torch.inference_mode():\n",
    "    inputs = tokenizer(sentence, return_tensors='pt')\n",
    "    if torch.cuda.is_available():\n",
    "        inputs = {k: v.cuda() for k, v in inputs.items()}\n",
    "    logits = model(**inputs).logits\n",
    "    pred = torch.argmax(logits, axis=-1)\n",
    "    print(f\"输入: {sentence}\\n模型输出结果: {model.config.id2label[pred.item()]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "pipe = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "397\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'label': '差评!', 'score': 0.5240045189857483}]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# max_position_embedding = 512, 所以输入序列最长不能超过512\n",
    "sentence = \"包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评，包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给,包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评，包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包装压坏了，里面的东西也坏了，怎么办，你处理的好的话，我就给你好评包\"\n",
    "print(len(sentence))\n",
    "pipe(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch2.4",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
